library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the raw track table from disk.
# NOTE(review): the absolute Windows path ties this script to one machine;
# consider a project-relative path.
spotify <- read.csv(file = "C:\\spotify_tracks.csv")

# Keep only the four columns used in the analysis, then drop every row that
# has a missing value in any of them.
spotify_clean <- spotify %>%
  select(liveness, instrumentalness, key, duration_ms) %>%
  filter(complete.cases(liveness, instrumentalness, key, duration_ms))

# Show the structure of the cleaned data frame.
str(spotify_clean)
## 'data.frame': 62317 obs. of 4 variables:
## $ liveness : num 0.1 0.0951 0.0831 0.124 0.345 0.215 0.178 0.132 0.299 0.114 ...
## $ instrumentalness: num 5.53e-02 0.00 0.00 7.27e-04 1.35e-06 3.53e-02 1.35e-05 0.00 3.70e-01 1.96e-02 ...
## $ key : num 8 10 2 7 7 7 9 4 4 1 ...
## $ duration_ms : num 97297 207369 82551 115831 129621 ...
# Return the most frequent value of a numeric vector.
#
# The frequency table is built with table(), whose entries are ordered by
# increasing value, and which.max() returns the first maximum, so ties are
# resolved in favour of the smallest value. Missing values are not counted
# (table() drops them by default). Returns NA_real_ when there is nothing to
# count.
most_frequent_value <- function(x) {
  counts <- table(x)
  if (length(counts) == 0) {
    return(NA_real_)
  }
  as.numeric(names(counts)[which.max(counts)])
}

# Descriptive statistics for liveness.
mean_liveness <- mean(spotify_clean$liveness)
median_liveness <- median(spotify_clean$liveness)
mode_liveness <- most_frequent_value(spotify_clean$liveness)
sd_liveness <- sd(spotify_clean$liveness)
var_liveness <- var(spotify_clean$liveness)

# Descriptive statistics for instrumentalness.
mean_instrumentalness <- mean(spotify_clean$instrumentalness)
median_instrumentalness <- median(spotify_clean$instrumentalness)
mode_instrumentalness <- most_frequent_value(spotify_clean$instrumentalness)
sd_instrumentalness <- sd(spotify_clean$instrumentalness)
var_instrumentalness <- var(spotify_clean$instrumentalness)
# Report the liveness summary: a heading line, then one line of
# label/value pairs. cat() joins its arguments with single spaces.
cat("Descriptive Statistics for Liveness:\n")
## Descriptive Statistics for Liveness:
cat(
  "Mean:", mean_liveness,
  "Median:", median_liveness,
  "Mode:", mode_liveness,
  "Standard Deviation:", sd_liveness,
  "Variance:", var_liveness,
  "\n"
)
## Mean: 0.1941425 Median: 0.125 Mode: 0.11 Standard Deviation: 0.1720304 Variance: 0.02959447

# Same report for instrumentalness.
cat("Descriptive Statistics for Instrumentalness:\n")
## Descriptive Statistics for Instrumentalness:
cat(
  "Mean:", mean_instrumentalness,
  "Median:", median_instrumentalness,
  "Mode:", mode_instrumentalness,
  "Standard Deviation:", sd_instrumentalness,
  "Variance:", var_instrumentalness,
  "\n"
)
## Mean: 0.1462145 Median: 2.54e-05 Mode: 0 Standard Deviation: 0.3078036 Variance: 0.09474303
# One-sample t-test: does the mean liveness differ from 0.2?
# The two-sided alternative and 95% confidence level are t.test() defaults,
# written out here for the reader.
t_test_one_sample <- t.test(
  spotify_clean$liveness,
  alternative = "two.sided",
  mu = 0.2,
  conf.level = 0.95
)
print(t_test_one_sample)
##
## One Sample t-test
##
## data: spotify_clean$liveness
## t = -8.4998, df = 62316, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.2
## 95 percent confidence interval:
## 0.1927918 0.1954932
## sample estimates:
## mean of x
## 0.1941425
# Label each track "High" when instrumentalness exceeds 0.5, otherwise "Low".
spotify_clean <- spotify_clean %>%
  mutate(
    instrumentalness_group = if_else(instrumentalness > 0.5, "High", "Low")
  )

# Welch two-sample t-test (unequal variances, the t.test() default) comparing
# track duration between the two instrumentalness groups.
t_test_two_sample <- t.test(
  duration_ms ~ instrumentalness_group,
  data = spotify_clean,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
print(t_test_two_sample)
##
## Welch Two Sample t-test
##
## data: duration_ms by instrumentalness_group
## t = -34.382, df = 12395, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group High and group Low is not equal to 0
## 95 percent confidence interval:
## -51489.90 -45935.62
## sample estimates:
## mean in group High mean in group Low
## 201528.8 250241.6
# Paired t-test on the per-track difference instrumentalness - liveness.
# Both vectors come from the same rows of spotify_clean, so the pairing is
# by track. Alternative and confidence level are the t.test() defaults.
t_test_paired <- t.test(
  spotify_clean$instrumentalness,
  spotify_clean$liveness,
  alternative = "two.sided",
  paired = TRUE,
  conf.level = 0.95
)
print(t_test_paired)
##
## Paired t-test
##
## data: spotify_clean$instrumentalness and spotify_clean$liveness
## t = -32.706, df = 62316, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.05080020 -0.04505578
## sample estimates:
## mean difference
## -0.04792799
# One-way ANOVA of track duration across musical key, with key converted to
# a categorical factor inside the formula.
# NOTE(review): the recorded summary reports 12 degrees of freedom for
# factor(key), i.e. 13 distinct key values, while the bar chart further down
# labels the axis "Key (0-11)". Presumably one extra code (such as a
# "no key detected" sentinel) is present in the data -- verify and decide
# whether it should be excluded.
anova_one_way <- aov(duration_ms ~ factor(key), data = spotify_clean)
# Display the ANOVA table (Df, sums of squares, F value, p-value).
summary(anova_one_way)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(key) 12 1.834e+12 1.528e+11 11.99 <2e-16 ***
## Residuals 62304 7.939e+14 1.274e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Simple linear regression of track duration (duration_ms) on liveness.
lm_model <- lm(duration_ms ~ liveness, data = spotify_clean)
# Display coefficients, residual summary and fit statistics. In the recorded
# output the liveness slope is not significant (p = 0.151) and R-squared is
# about 3e-05.
summary(lm_model)
##
## Call:
## lm(formula = duration_ms ~ liveness, data = spotify_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233992 -50210 -6266 43737 4339096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 241793.3 682.5 354.254 <2e-16 ***
## liveness 3779.6 2631.3 1.436 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113000 on 62315 degrees of freedom
## Multiple R-squared: 3.311e-05, Adjusted R-squared: 1.706e-05
## F-statistic: 2.063 on 1 and 62315 DF, p-value: 0.1509
# Bar chart of how many tracks fall in each musical key; key is treated as a
# discrete category on the x axis.
ggplot(data = spotify_clean) +
  geom_bar(
    mapping = aes(x = factor(key)),
    fill = "skyblue",
    colour = "black"
  ) +
  ggtitle("Distribution of Musical Keys") +
  xlab("Key (0-11)") +
  ylab("Count") +
  theme_minimal()

# Scatter plot of liveness (y) against instrumentalness (x), one blue point
# per track.
ggplot(data = spotify_clean) +
  geom_point(
    mapping = aes(x = instrumentalness, y = liveness),
    colour = "blue"
  ) +
  ggtitle("Liveness vs Instrumentalness") +
  xlab("Instrumentalness") +
  ylab("Liveness") +
  theme_minimal()

# Interactive 3D scatter of duration (x), liveness (y) and instrumentalness
# (z). Marker colour also follows instrumentalness on the Viridis scale.
plot_3d <- plot_ly(
  data = spotify_clean,
  type = "scatter3d",
  mode = "markers",
  x = ~duration_ms,
  y = ~liveness,
  z = ~instrumentalness,
  marker = list(
    size = 3,
    color = ~instrumentalness,
    colorscale = "Viridis",
    opacity = 0.8
  )
)

# Attach the figure title and the three axis titles of the 3D scene.
plot_3d <- plotly::layout(
  plot_3d,
  title = "3D Scatter Plot of Duration, Liveness, and Instrumentalness",
  scene = list(
    xaxis = list(title = "Duration (ms)"),
    yaxis = list(title = "Liveness"),
    zaxis = list(title = "Instrumentalness")
  )
)

# Render the widget.
plot_3d